import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split,\
cross_val_score, KFold, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import metrics
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# ---- Global plotting configuration for the whole notebook ----
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib
# 3.6 (renamed 'seaborn-v0_8-whitegrid') -- confirm the installed version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Suppress library warnings to keep notebook output readable.
warnings.filterwarnings("ignore")
In this article, we would like to work on a natural language processing (NLP) project. In doing so, we use a dataset from Kaggle.com.
The dataset contains text from works of fiction written by spooky authors of the public domain: Edgar Allan Poe, HP Lovecraft and Mary Shelley. The data was prepared by chunking larger texts into sentences using CoreNLP's MaxEnt sentence tokenizer, so you may notice the odd non-sentence here and there. Your objective is to accurately identify the author of the sentences in the test set.
# Load the Kaggle "Spooky Author Identification" train and test sets and
# normalize the column names ('id' -> 'ID', 'text' -> 'Text', ...).
Data = pd.read_csv('spooky-author-identification/train.csv')
Data.columns = [col.title().replace('Id', 'ID') for col in Data.columns]
Pred = pd.read_csv('spooky-author-identification/test.csv')
Pred.columns = [col.title().replace('Id', 'ID') for col in Pred.columns]
def Data_info(Inp, Only_NaN = False):
    """Summarize a DataFrame: per-column dtype, NaN count and NaN percentage.

    Parameters
    ----------
    Inp : pd.DataFrame to summarize.
    Only_NaN : bool, if True keep only the columns that contain NaN values.

    Returns a DataFrame indexed by column name, sorted by dtype.
    """
    summary = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_counts = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    summary = summary.join(nan_counts, how='outer')
    summary['Percentage'] = np.round(100 * (summary['Number of NaN Values'] / Inp.shape[0]), 2)
    return summary.loc[summary['Number of NaN Values'] > 0] if Only_NaN else summary
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print `Text` highlighted with a colorama background, followed by a
    colored '=' rule padding the line to a total width of L characters.

    Parameters
    ----------
    Text : str, the heading to print.
    L : int, total line width.
    C : str, color name for the background and the rule.
    T : str, color name for the heading text.
    """
    back_colors = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                   'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN}
    fore_colors = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
                   'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
    title = back_colors[C] + fore_colors[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = fore_colors[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(title + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in colorama color C."""
    fore_colors = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
                   'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
    print(fore_colors[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List):
    """Return every string in `List` that contains `Key` as a substring."""
    return list(filter(lambda s: Key in s, List))
# Preview both datasets along with their dtype / missing-value summaries.
Header('Data Dataset:', C = 'Green')
display(Data.head())
display(Data_info(Data))
Header('Pred Dataset:', C = 'Magenta')
display(Pred.head())
display(Data_info(Pred))
Line()
Data Dataset: ======================================================================================
| ID | Text | Author | |
|---|---|---|---|
| 0 | id26305 | This process, however, afforded me no means of... | EAP |
| 1 | id17569 | It never once occurred to me that the fumbling... | HPL |
| 2 | id11008 | In his left hand was a gold snuff box, from wh... | EAP |
| 3 | id27763 | How lovely is spring As we looked from Windsor... | MWS |
| 4 | id12958 | Finding nothing else, not even gold, the Super... | HPL |
| Data Type | Number of NaN Values | Percentage | |
|---|---|---|---|
| ID | object | 0 | 0.0 |
| Text | object | 0 | 0.0 |
| Author | object | 0 | 0.0 |
Pred Dataset: ======================================================================================
| ID | Text | |
|---|---|---|
| 0 | id02310 | Still, as I urged our leaving Ireland with suc... |
| 1 | id24541 | If a fire wanted fanning, it could readily be ... |
| 2 | id00134 | And when they had broken down the frail door t... |
| 3 | id27757 | While I was thinking how I should possibly man... |
| 4 | id04081 | I am not sure to what limit his knowledge may ... |
| Data Type | Number of NaN Values | Percentage | |
|---|---|---|---|
| ID | object | 0 | 0.0 |
| Text | object | 0 | 0.0 |
====================================================================================================
We would like to develop a model that recognizes/predicts the author of a given text.
First off, let's define $X$ and $y$ sets. We can use sklearn.preprocessing.LabelEncoder to encode the author from the Data.
# Feature texts and integer-encoded author labels.
X = Data['Text'].values
# Encoding
le_author = LabelEncoder()
y = le_author.fit_transform(Data['Author'].values)
Target = 'Author'
# Map from encoded label -> author initials. Read the fitted encoder's
# classes_ attribute instead of calling fit_transform a second time:
# re-fitting the encoder is redundant, and classes_ guarantees the
# code -> label order matches the encoding actually applied to y.
Header('Author Map:', C = 'Black', T = 'Yellow')
Labels_dict = dict(enumerate(le_author.classes_))
display(pd.DataFrame.from_dict(Labels_dict, orient='index', columns=['Author']).T)
Author Map: ========================================================================================
| 0 | 1 | 2 | |
|---|---|---|---|
| Author | EAP | HPL | MWS |
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    """Plot the class distribution of `Target` in `Inp` as a plotly figure
    with a count/percentage table (left) and a donut pie chart (right).

    Parameters
    ----------
    Inp : DataFrame containing the (encoded) target column.
    Target : str, name of the target column.
    Labels_dict : dict mapping encoded labels to display names.
    PD : dict of plot parameters ('column_widths', 'pull', 'textfont',
        'PieColors', 'hole', 'height', 'legend_title', 'TableColors',
        'tablecolumnwidth').
    """
    # Per-class count and percentage table, decoded to display names.
    Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    Table[Target] = Table[Target].replace(Labels_dict)
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right panel: donut pie of class counts (hole applied afterwards, while
    # the pie is the only trace so update_traces touches just this trace).
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                         pull=PD['pull'], textfont=dict(size= PD['textfont']),
                         marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left panel: table trace; percentages first formatted to 2-decimal strings.
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= PD['TableColors'][0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    # Centered bold figure title.
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Plot appearance parameters for DatasetTargetDist.
PD = {'PieColors': ['SeaGreen', 'FireBrick', 'RoyalBlue'],
      'TableColors': ['Navy', 'White'],
      'hole': 0.4,
      'column_widths': [0.5, 0.5],
      'textfont': 14,
      'height': 350,
      'tablecolumnwidth': [0.1, 0.15, 0.15],
      'pull': [0.05, 0.01, 0.01],
      'legend_title': Target}
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit is a variation of ShuffleSplit which returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Single stratified shuffle split into train (70%) and test (30%) sets,
# preserving the class balance of y in both parts.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
for train_index, test_index in sss.split(X, y):
    # split() yields POSITIONAL indices, so pandas objects must be indexed
    # with .iloc: the original .loc / plain [] indexing only worked by
    # accident on a default RangeIndex and would break (or silently select
    # wrong rows) on any other index.
    # X
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    # y
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, Labels_dict, PD):
    """Visualize the train/test split: a table of set shapes (left) and one
    donut pie per set (train, test) showing the class balance.

    Parameters
    ----------
    X_train, X_test : feature arrays/frames (only their .shape is shown).
    y_train, y_test : label arrays/Series whose distributions are plotted.
    Labels_dict : dict mapping encoded labels to display names.
    PD : dict of plot parameters ('pull', 'textfont', 'PieColors', 'hole',
        'height', 'legend_title', 'TableColors').

    Relies on the module-level `Target` name for the label column.
    """
    def ToSeries(x):
        # Wrap plain arrays in a Series so .value_counts() is available.
        if not isinstance(x, pd.Series):
            Out = pd.Series(x)
        else:
            Out = x.copy()
        return Out
    fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= [0.25, 0.3, 0.3],
                        specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
    # Middle/right panels: one pie per label set.
    # NOTE(review): the loop variable `y` shadows the module-level target
    # array `y` inside this function (harmless here, but easy to misread).
    C = 2
    for y in [y_train, y_test]:
        Table = ToSeries(y).value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
        Table[Target] = Table[Target].replace(Labels_dict)
        fig.add_trace(go.Pie(labels= Table[Target], values= Table['Count'], pull=PD['pull'], textfont=dict(size=PD['textfont']),
                             marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=C)
        fig.update_traces(hole= PD['hole'])
        fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
        C+=1
    # Left panel: table of the four set shapes (as strings).
    # Table
    Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
    T = Table.copy()
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = [0.15, 0.25],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    # Annotations in the pie holes, plus a centered bold title.
    fig.update_layout(annotations=[dict(text= '<b>' + 'Train<br>Set' + '<b>', x=0.475, y=0.5, font_size=14, showarrow=False),
                                   dict(text= '<b>' + 'Test<br>Set' + '<b>', x=0.86, y=0.5, font_size=14, showarrow=False)])
    fig.update_layout(title={'text': '<b>' + 'Train and Test Sets' + '<b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    # NOTE(review): redundant with the update_layout in the loop; also
    # `PD['height'] is not None` would be the idiomatic test.
    if not PD['height'] == None:
        fig.update_layout(height = PD['height'])
    fig.show()
# Plot appearance parameters for Train_Test_Dist.
PD = {'PieColors': ['SeaGreen', 'FireBrick', 'RoyalBlue'],
      'TableColors': ['Navy', 'White'],
      'hole': 0.4,
      'textfont': 12,
      'height': 350,
      'pull': [0.05, 0.01, 0.01],
      'legend_title': Target}
Train_Test_Dist(X_train, y_train, X_test, y_test, Labels_dict, PD)
Moreover, we use sklearn.feature_extraction.text.TfidfVectorizer to convert the Text data to a matrix of TF-IDF features.
# TF-IDF features over word 1-3-grams, dropping English stop words and terms
# seen in fewer than 3 documents. Boolean parameters are passed as booleans
# (the original integer flags 1/0 work but obscure intent).
Tfidf_Vec = TfidfVectorizer(min_df=3, max_features=None,
                            strip_accents='unicode', analyzer='word',
                            token_pattern=r'\w{1,}', ngram_range=(1, 3),
                            use_idf=True, smooth_idf=True, sublinear_tf=True,
                            stop_words = 'english')
# Fit the vocabulary on the full corpus once (fit_transform avoids the
# redundant list(X) copy and separate fit/transform passes), then transform
# the train/test subsets with the same vocabulary.
X_TFIDFvec = Tfidf_Vec.fit_transform(X)
X_train_TFIDFvec = Tfidf_Vec.transform(X_train)
X_test_TFIDFvec = Tfidf_Vec.transform(X_test)
An alternative approach would be using sklearn.feature_extraction.text.CountVectorizer to convert Text data to a matrix of token counts.
# Bag-of-words token counts over the same word 1-3-gram settings.
Count_Vec = CountVectorizer(analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 3),
                            stop_words = 'english')
Count_Vec.fit(list(X))
# Transform the full corpus and both subsets with the fitted vocabulary.
X_cv, X_train_cv, X_test_cv = (Count_Vec.transform(part) for part in (X, X_train, X_test))
We can use sklearn.linear_model.LogisticRegression (using TF-IDF features).
def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10):
    """Evaluate `model` over repeated stratified shuffle splits.

    For each of `n_splits` stratified train/test splits (test fraction taken
    from the module-level `Test_Size`), the model is refit and a sklearn
    classification report plus confusion matrix is computed on both the
    train and test portions. Per-split report values are aggregated into
    'mean ± std' strings; confusion matrices into their element-wise
    rounded mean.

    NOTE(review): the defaults `X = X, y = y` capture the module-level
    arrays at definition time; kept as-is for interface compatibility.
    Relies on the module-level `Labels` list for report row names.

    Returns (Reports_Train, Reports_Test, CM_Train, CM_Test).
    """
    def _summarize(report_values, cms, template, col_label):
        # Aggregate per-split report matrices / confusion matrices.
        # np.stack once over the collected list replaces the original
        # quadratic vstack-in-a-loop accumulation.
        stacked = np.stack(report_values)
        mean = pd.DataFrame(stacked.mean(axis=0).reshape(template.shape),
                            index=template.index, columns=template.columns)
        std = pd.DataFrame(stacked.std(axis=0).reshape(template.shape),
                           index=template.index, columns=template.columns)
        report = (mean.applymap(lambda v: '%.4f' % v) + ' ± '
                  + std.applymap(lambda v: '%.4f' % v))
        report = report.reset_index().rename(columns={'index': col_label})
        cm_mean = np.stack([c.ravel() for c in cms]).mean(axis=0)
        cm_mean = cm_mean.reshape(cms[0].shape).round(0).astype(int)
        return report, cm_mean

    sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    Reports_Train, Reports_Test, CM_Train, CM_Test = [], [], [], []
    R = None
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        _ = model.fit(X_train, y_train)
        # Train-set metrics for this split.
        y_pred = model.predict(X_train)
        R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Train.append(R.values)
        CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
        # Test-set metrics for this split.
        y_pred = model.predict(X_test)
        R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Test.append(R.values)
        CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
    # All reports share R's shape/index/columns, so the last R serves as the
    # template for both aggregations (as in the original).
    Reports_Train, CM_Train = _summarize(Reports_Train, CM_Train, R, 'Train Set (CV = % i)' % n_splits)
    Reports_Test, CM_Test = _summarize(Reports_Test, CM_Test, R, 'Test Set (CV = % i)' % n_splits)
    return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Draw raw and row-normalized confusion-matrix heatmaps for the train
    and the test set (one figure per set).

    Parameters
    ----------
    CM_Train, CM_Test : 2-D confusion-count arrays.
    PD : dict of plot parameters ('FS', 'annot_kws', 'shrink', 'tick_angle',
        'tick_fontsize').
    n_splits : int or None; when an int, the CV split count is shown in the
        figure titles, when None it is omitted.

    Relies on the module-level `Labels` list for tick labels.
    """
    # `is None`, not `== None` (PEP 8; robust against overridden __eq__).
    if n_splits is None:
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    CM = [CM_Train, CM_Test]
    Cmap = ['Greens', 'YlGn','Blues', 'PuBu']
    for i in range(2):
        fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
        fig.suptitle(Titles[i], weight = 'bold', fontsize = 16)
        # Left panel: raw counts.
        _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i], ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
        _ = ax[0].set_title('Confusion Matrix')
        # Right panel: each row divided by its true-class total (recall view).
        Temp = np.round(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis], 2)
        _ = sns.heatmap(Temp,
                        annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i+1], ax = ax[1],
                        linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(Labels, rotation=PD['tick_angle'], fontsize = PD['tick_fontsize'])
            _ = a.yaxis.set_ticklabels(Labels, rotation=PD['tick_angle'], fontsize = PD['tick_fontsize'])
            _ = a.set_aspect(1)
# Baseline: default-parameter logistic regression on TF-IDF features,
# scored over repeated stratified splits.
Labels = list(Labels_dict.values())
Header('Logistic Regression with Default Parameters (using TF-IDF features)')
n_splits = 20
logr = LogisticRegression(C = 1.0, max_iter = 200, n_jobs = -1)
print('Default Parameters = %s' % logr.get_params(deep=True))
_ = logr.fit(X_train_TFIDFvec, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(logr, X = X_TFIDFvec, y = y, n_splits = n_splits)
# Styled report tables: green accent for train, blue accent for test.
styled_train = (Reports_Train.style.hide_index()
                .set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'})
                .set_properties(subset=['Train Set (CV = % i)' % n_splits],
                                **{'background-color': 'SeaGreen', 'color': 'White'}))
display(styled_train)
styled_test = (Reports_Test.style.hide_index()
               .set_properties(**{'background-color': 'Azure', 'color': 'Black'})
               .set_properties(subset=['Test Set (CV = % i)' % n_splits],
                               **{'background-color': 'RoyalBlue', 'color': 'White'}))
display(styled_test)
Line()
PD = {'FS': (12, 6), 'annot_kws': 14, 'shrink': 0.6, 'tick_angle': 0, 'tick_fontsize': 11}
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
Logistic Regression with Default Parameters (using TF-IDF features) ================================ Default Parameters = {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| EAP | 0.9094 ± 0.0022 | 0.9557 ± 0.0018 | 0.9320 ± 0.0012 | 5530.0000 ± 0.0000 |
| HPL | 0.9494 ± 0.0028 | 0.9178 ± 0.0027 | 0.9333 ± 0.0020 | 3944.0000 ± 0.0000 |
| MWS | 0.9508 ± 0.0019 | 0.9170 ± 0.0023 | 0.9336 ± 0.0015 | 4231.0000 ± 0.0000 |
| accuracy | 0.9329 ± 0.0011 | 0.9329 ± 0.0011 | 0.9329 ± 0.0011 | 0.9329 ± 0.0011 |
| macro avg | 0.9366 ± 0.0010 | 0.9302 ± 0.0012 | 0.9330 ± 0.0011 | 13705.0000 ± 0.0000 |
| weighted avg | 0.9337 ± 0.0010 | 0.9329 ± 0.0011 | 0.9329 ± 0.0011 | 13705.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| EAP | 0.7543 ± 0.0057 | 0.8638 ± 0.0059 | 0.8053 ± 0.0044 | 2370.0000 ± 0.0000 |
| HPL | 0.8317 ± 0.0069 | 0.7407 ± 0.0081 | 0.7836 ± 0.0060 | 1691.0000 ± 0.0000 |
| MWS | 0.8371 ± 0.0061 | 0.7636 ± 0.0088 | 0.7986 ± 0.0049 | 1813.0000 ± 0.0000 |
| accuracy | 0.7974 ± 0.0040 | 0.7974 ± 0.0040 | 0.7974 ± 0.0040 | 0.7974 ± 0.0040 |
| macro avg | 0.8077 ± 0.0038 | 0.7893 ± 0.0042 | 0.7958 ± 0.0041 | 5874.0000 ± 0.0000 |
| weighted avg | 0.8021 ± 0.0038 | 0.7974 ± 0.0040 | 0.7970 ± 0.0040 | 5874.0000 ± 0.0000 |
====================================================================================================
# Same baseline logistic regression, this time on raw token counts.
Labels = list(Labels_dict.values())
Header('Logistic Regression with Default Parameters (using Count Vectorizer)')
n_splits = 20
logr = LogisticRegression(C = 1.0, max_iter = 200, n_jobs = -1)
print('Default Parameters = %s' % logr.get_params(deep=True))
_ = logr.fit(X_train_cv, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(logr, X = X_cv, y = y, n_splits = n_splits)
# Styled report tables: green accent for train, blue accent for test.
styled_train = (Reports_Train.style.hide_index()
                .set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'})
                .set_properties(subset=['Train Set (CV = % i)' % n_splits],
                                **{'background-color': 'SeaGreen', 'color': 'White'}))
display(styled_train)
styled_test = (Reports_Test.style.hide_index()
               .set_properties(**{'background-color': 'Azure', 'color': 'Black'})
               .set_properties(subset=['Test Set (CV = % i)' % n_splits],
                               **{'background-color': 'RoyalBlue', 'color': 'White'}))
display(styled_test)
Line()
PD = {'FS': (12, 6), 'annot_kws': 14, 'shrink': 0.6, 'tick_angle': 0, 'tick_fontsize': 11}
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
Logistic Regression with Default Parameters (using Count Vectorizer) =============================== Default Parameters = {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| EAP | 0.9914 ± 0.0006 | 0.9999 ± 0.0001 | 0.9957 ± 0.0003 | 5530.0000 ± 0.0000 |
| HPL | 1.0000 ± 0.0000 | 0.9942 ± 0.0005 | 0.9971 ± 0.0003 | 3944.0000 ± 0.0000 |
| MWS | 0.9998 ± 0.0002 | 0.9941 ± 0.0006 | 0.9969 ± 0.0003 | 4231.0000 ± 0.0000 |
| accuracy | 0.9965 ± 0.0003 | 0.9965 ± 0.0003 | 0.9965 ± 0.0003 | 0.9965 ± 0.0003 |
| macro avg | 0.9971 ± 0.0002 | 0.9961 ± 0.0003 | 0.9966 ± 0.0003 | 13705.0000 ± 0.0000 |
| weighted avg | 0.9965 ± 0.0003 | 0.9965 ± 0.0003 | 0.9965 ± 0.0003 | 13705.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| EAP | 0.7181 ± 0.0056 | 0.8699 ± 0.0063 | 0.7867 ± 0.0046 | 2370.0000 ± 0.0000 |
| HPL | 0.8355 ± 0.0082 | 0.6772 ± 0.0084 | 0.7480 ± 0.0065 | 1691.0000 ± 0.0000 |
| MWS | 0.8128 ± 0.0073 | 0.7316 ± 0.0098 | 0.7700 ± 0.0056 | 1813.0000 ± 0.0000 |
| accuracy | 0.7717 ± 0.0042 | 0.7717 ± 0.0042 | 0.7717 ± 0.0042 | 0.7717 ± 0.0042 |
| macro avg | 0.7888 ± 0.0044 | 0.7596 ± 0.0043 | 0.7682 ± 0.0043 | 5874.0000 ± 0.0000 |
| weighted avg | 0.7811 ± 0.0042 | 0.7717 ± 0.0042 | 0.7704 ± 0.0042 | 5874.0000 ± 0.0000 |
====================================================================================================
Using the model, we can now predict the authors of the texts in the Pred (Test) dataset.
# Score the unlabeled Pred set: per-author probabilities plus the predicted
# class, decoded back to author initials.
X_pred = Count_Vec.transform(Pred.Text.values)
y_pred = logr.predict_proba(X_pred)
probabilities = pd.DataFrame(y_pred, columns = Labels)
Predictions = pd.concat([Pred, probabilities], axis = 1)
Predictions['Prediction'] = logr.predict(X_pred)
Predictions['Prediction'] = Predictions['Prediction'].replace(Labels_dict)
Predictions.round(4)
| ID | Text | EAP | HPL | MWS | Prediction | |
|---|---|---|---|---|---|---|
| 0 | id02310 | Still, as I urged our leaving Ireland with suc... | 0.1827 | 0.0422 | 0.7751 | MWS |
| 1 | id24541 | If a fire wanted fanning, it could readily be ... | 0.8135 | 0.1664 | 0.0201 | EAP |
| 2 | id00134 | And when they had broken down the frail door t... | 0.4087 | 0.5776 | 0.0137 | HPL |
| 3 | id27757 | While I was thinking how I should possibly man... | 0.8219 | 0.1632 | 0.0149 | EAP |
| 4 | id04081 | I am not sure to what limit his knowledge may ... | 0.8266 | 0.0908 | 0.0826 | EAP |
| ... | ... | ... | ... | ... | ... | ... |
| 8387 | id11749 | All this is now the fitter for my purpose. | 0.6240 | 0.0803 | 0.2957 | EAP |
| 8388 | id10526 | I fixed myself on a wide solitude. | 0.1979 | 0.0566 | 0.7455 | MWS |
| 8389 | id13477 | It is easily understood that what might improv... | 0.9442 | 0.0183 | 0.0375 | EAP |
| 8390 | id13761 | Be this as it may, I now began to feel the ins... | 0.0486 | 0.0105 | 0.9409 | MWS |
| 8391 | id04282 | Long winded, statistical, and drearily genealo... | 0.1657 | 0.8244 | 0.0100 | HPL |
8392 rows × 6 columns